In [20]:
#plotly is used for interactive web-based visualizations
In [ ]:
#conda install -c plotly plotly
In [6]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout
import plotly
import plotly.express as px
import plotly.graph_objects as go
init_notebook_mode(connected=False)
import json
import pandas as pd
import numpy as np
In [7]:
df = pd.read_csv('/Users/minyan/Desktop/Python Project/AB testing_interactive display/Datasets/mobilegames_cookie_cats.csv')
df.head()
Out[7]:
userid version sum_gamerounds retention_1 retention_7
0 116 gate_30 3 False False
1 337 gate_30 38 True False
2 377 gate_40 165 True False
3 483 gate_40 1 False False
4 488 gate_40 179 True True
In [8]:
#check missing values
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90189 entries, 0 to 90188
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   userid          90189 non-null  int64 
 1   version         90189 non-null  object
 2   sum_gamerounds  90189 non-null  int64 
 3   retention_1     90189 non-null  bool  
 4   retention_7     90189 non-null  bool  
dtypes: bool(2), int64(2), object(1)
memory usage: 2.2+ MB
In [9]:
#list how many null values for each feature
print(df.isnull().sum().sort_values(ascending=False))
userid            0
version           0
sum_gamerounds    0
retention_1       0
retention_7       0
dtype: int64
In [10]:
#count the number of players in each group
df.groupby('version').count()
Out[10]:
userid sum_gamerounds retention_1 retention_7
version
gate_30 44700 44700 44700 44700
gate_40 45489 45489 45489 45489
In [11]:
df.groupby('version')['sum_gamerounds'].mean()
Out[11]:
version
gate_30    52.456264
gate_40    51.298776
Name: sum_gamerounds, dtype: float64
In [12]:
df['sum_gamerounds'].describe()
Out[12]:
count    90189.000000
mean        51.872457
std        195.050858
min          0.000000
25%          5.000000
50%         16.000000
75%         51.000000
max      49854.000000
Name: sum_gamerounds, dtype: float64
In [13]:
fig= px.box(df['sum_gamerounds'],y='sum_gamerounds')
#fig.show()

#plot and embed in ipython notebook!
iplot(fig,show_link=False)
In [14]:
#counting the number of players for each of gamerounds
plot_df = df.groupby('sum_gamerounds')['userid'].count()
plot_df
Out[14]:
sum_gamerounds
0        3994
1        5538
2        4606
3        3958
4        3629
         ... 
2294        1
2438        1
2640        1
2961        1
49854       1
Name: userid, Length: 942, dtype: int64
In [15]:
plot_ga=df[df['version']=='gate_30'].groupby('sum_gamerounds')['userid'].count()
plot_gb=df[df['version']=='gate_40'].groupby('sum_gamerounds')['userid'].count()
#bins= [0,10,20,30,40,50,60,70,80,90, 100, 200, 400, 600]
In [16]:
# prepare data
trace1 = go.Histogram(
    x=plot_ga,
    opacity=0.75,
    name = 'gate_30',
    marker = dict(color ='rgba(171,50,97,0.6)'))

trace2 = go.Histogram(
    x=plot_gb,
    opacity=0.75,
    name = 'gate_40',
    marker = dict(color = 'rgba(12,50,196,0.6)'))

da = [trace1, trace2]

lay = go.Layout(barmode = 'overlay',
                title = 'gate_30 vs. gate_40',
                xaxis = dict(title ='Number of players for each of gamerounds'),
                yaxis = dict(title = 'Count'))

fig = go.Figure(data=da, layout=lay)
#fig.show()
iplot(fig, show_link=False)
In [17]:
#plot the distribution of players that played 0 to 100 game rounds
#prepare the dataframe


plot_df = df.groupby('sum_gamerounds')['userid'].count()
da = plot_df[:101]


lay = go.Layout()

fig=px.line(da)

fig.update_layout(title = 'the number of players that played the 0-100 game rounds during the first week',
                  showlegend = False,
                  xaxis = dict(title ='the number of players for each of gamerounds'),
                  yaxis = dict(title = 'Counts')
)

#fig.show()
iplot(fig, show_link=False)
In [18]:
#Null hypothesis: the difference of conversion rate between a/b group is by chance
#Alternative hypothesis: conversion rate of group a is statistical significant larger then group b
In [19]:
#overall one-day retention
#A common metric measuign how fun and engaging a game is 1-day retention
#calculate p_pool

p_pool = df['retention_1'].sum()/df['retention_1'].count()
In [20]:
#A/B test retention for each AB group
df.groupby('version')['retention_1'].mean()
Out[20]:
version
gate_30    0.448188
gate_40    0.442283
Name: retention_1, dtype: float64
In [21]:
#Solution 1:calculate the mean difference
p_diff=df[df['version']== "gate_30"]['retention_1'].mean()-df[df['version']== "gate_40"]['retention_1'].mean()
p_diff
Out[21]:
0.005905169787341458
In [22]:
#calculate pooled standard error 
count1=df[df['version']== "gate_30"]['retention_1'].count()
count2=df[df['version']== "gate_40"]['retention_1'].count()
se_pool = np.sqrt(p_pool*(1-p_pool)*(1/count1+1/count2))
se_pool
Out[22]:
0.0033099127751024513
In [23]:
#for 95% confidence interval the value of Z is 1.96 or we can use pcipy package to calculate it
from scipy.stats import norm
alpha=0.05
z=round(norm.ppf(1-alpha/2),2)
#calculate marginal error
marginal_error = round((z*se_pool),4)
marginal_error

lb=p_diff-marginal_error
ub=p_diff+marginal_error

if lb>0:
    print('Reject null hypothesis.')
else:
    print('Do not reject null hypothesis')
Do not reject null hypothesis
In [24]:
#Solution 2: Bootstrapping: should we be confident in the difference?
#predict the statistics under the null hypothesis
In [32]:
#create a list with bootstrappwd means for each AB-group
boot_1d=[]
for i in range(1000):
    boot_mean=df.sample(frac = 1, replace = True).groupby('version')['retention_1'].mean()
    boot_1d.append(boot_mean)
    
#transform the list to a Dataframe
boot_1d=pd.DataFrame(boot_1d)
print(boot_1d)
version       gate_30   gate_40
retention_1  0.451163  0.444572
retention_1  0.446569  0.441559
retention_1  0.450612  0.444282
retention_1  0.446838  0.447215
retention_1  0.450801  0.444327
...               ...       ...
retention_1  0.449773  0.442837
retention_1  0.450377  0.441742
retention_1  0.449813  0.444208
retention_1  0.446100  0.445676
retention_1  0.449574  0.443191

[1000 rows x 2 columns]
In [33]:
# A kernel Density estimate plot of the boostrap distributiona
# Use distplot for density curve, along with Pandas
import plotly.figure_factory as ff
fig = ff.create_distplot([boot_1d[c] for c in boot_1d.columns], boot_1d.columns,
                          show_rug=False, show_hist=False)
fig.update_layout(title_text = 'A kernel density plot of the boostrap distribution')
#fig.show()
iplot(fig, show_link=False)
In [36]:
#add a column with the difference between AB group
boot_1d['diff'] = (boot_1d.gate_30 - boot_1d.gate_40)/boot_1d.gate_40*100
#plot the bootstrap % difference

da = pd.DataFrame(boot_1d['diff'])
fig = ff.create_distplot( [da[c] for c in da.columns], da.columns,
                          show_rug=False, show_hist=False)
fig.add_shape(type='line',
              x0=p_diff, y0=-0.01, x1=p_diff, y1=0.6,
              line=dict(color='red',width=2)
             )
fig.add_annotation(
                x=p_diff,
                y=0.61,
                showarrow=False,
                text= p_diff)
fig.update_layout(title_text = '%difference in 1-day retention between AB groups',
                 showlegend = False,
                 xaxis = dict(title ='Percentage of Difference'),
                 yaxis = dict(title = 'Density')
                 )
#fig.show()
iplot(fig, show_link=False)
In [37]:
#calculate the probablity that 1-day retention is greater when the gate is at level 30
print((boot_1d['diff']>p_diff).mean())
print('Probablity that 1-day retention is greater than observed difference when the gate is at level 30:', (boot_1d['diff']>p_diff).mean())
0.964
Probablity that 1-day retention is greater than observed difference when the gate is at level 30: 0.964
In [38]:
print('We cant reject Ho because p-value(0.966>0.05), the difference is insignificant.')
We cant reject Ho because p-value(0.966>0.05), the difference is insignificant.
In [ ]: